library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(limma)
library(psych)
library(tidyverse)
library(CONSTANd)  # install from source: https://github.com/PDiracDelta/CONSTANd/

This notebook presents isobaric labeling data analysis strategy that includes data-driven normalization.

In other notebooks in this series we have systematically varied components and observed how they affect the outcome of a DEA analysis. We have seen that medianSweeping normalization works does not work well for intensities on the original scale, and that CONSTANd does not work well on log2-trnsformed intensities. Here we compare medianSweeping on log2 scale, which we know does a good job, with CONSTANd on original intensity scale.

source('./other_functions.R')
source('./plotting_functions.R')

# you should either make a symbolic link in this directory
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# dat.w <- data.list$dat.w # data in wide format
if ('X' %in% colnames(dat.l)) { dat.l$X <- NULL }

# remove shared peptides
shared.peptides <- dat.l %>% filter(!shared.peptide)

# keep spectra with isolation interference <30 and no missing quantification channels
dat.l <- dat.l %>% filter(isoInterOk & noNAs)

# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character

# which peptides were identified in each MS run?
unique.pep=dat.l %>% 
  group_by(Run) %>%
  distinct(Peptide) %>% 
  mutate(val=1)
unique.pep <- xtabs(val~Peptide+Run, data=unique.pep)
tmp <- apply(unique.pep, 1, function(x) all(x==1))
inner.peptides <- rownames(unique.pep)[tmp]
# specify # of varying component variants and their names
variant.names <- c('medianSweeping', 'CONSTANd')
n.comp.variants <- length(variant.names)
scale.vec <- c('log', 'raw')  # ratios are considered raw, because they are basically mean-normalized intensities
# pick reference condition for making plots / doing DEA
referenceCondition <- '0.5'
# specify colours corresponding to biological conditions
condition.colour <- tribble(
  ~Condition, ~Colour,
  "0.125", 'black',
  "0.5", 'blue',
  "0.667", 'green',
  "1", 'red' )
# create data frame with sample info (distinct Run,Channel, Sample, Condition, Colour)
sample.info <- get_sample_info(dat.l, condition.colour)
channelNames <- remove_factors(unique(sample.info$Channel))

1 Unit scale component

Which scale are the reporter ion intensities on?

dat.unit.l <- emptyList(variant.names)

1.1 medianSweeping: log2 intensity

dat.unit.l$medianSweeping <- dat.l %>% mutate(response=log2(Intensity)) %>% select(-Intensity)

1.2 CONSTANd: original intensity

dat.unit.l$CONSTANd <- dat.l %>% rename(response=Intensity)

2 Normalization component

CONSTANd vs medianSweeping (in 2 steps)

# switch to wide format
dat.unit.w <- lapply(dat.unit.l, function(x) {
  pivot_wider(data = x, id_cols=-one_of(c('Condition', 'BioReplicate')), names_from=Channel, values_from=response)
})
dat.norm.w <- emptyList(names(dat.unit.w))

2.1 medianSweeping (1)

# subtract the spectrum median log2intensity from the observed log2intensities
dat.norm.w$medianSweeping <- dat.unit.w$medianSweeping
dat.norm.w$medianSweeping[,channelNames] <- dat.norm.w$medianSweeping[,channelNames] %>% sweep(1, apply(.[,channelNames], 1, median, na.rm=T))
dat.norm.w$medianSweeping
## # A tibble: 58,028 x 23
##    Mixture TechRepMixture Run   Protein Peptide    RT Charge PTM  
##    <fct>   <fct>          <fct> <fct>   <fct>   <dbl> <fct>  <fct>
##  1 Mixtur… 1              Mixt… P21291  [K].gF… 145.  2      N-Te…
##  2 Mixtur… 1              Mixt… P30511  [K].wA… 137.  2      N-Te…
##  3 Mixtur… 1              Mixt… P02787… [K].sA… 149.  3      N-Te…
##  4 Mixtur… 1              Mixt… Q9UBQ5  [K].iD… 202.  2      N-Te…
##  5 Mixtur… 1              Mixt… Q5H9R7  [R].tG… 107.  2      N-Te…
##  6 Mixtur… 1              Mixt… P43490  [K].nA…  99.8 3      N-Te…
##  7 Mixtur… 1              Mixt… P43897  [R].fE… 113.  2      N-Te…
##  8 Mixtur… 1              Mixt… Q6XQN6  [R].lQ… 190.  3      N-Te…
##  9 Mixtur… 1              Mixt… P49327  [R].dL… 215.  3      N-Te…
## 10 Mixtur… 1              Mixt… Q02790  [K].sN…  58.8 2      N-Te…
## # … with 58,018 more rows, and 15 more variables: TotalIntensity <dbl>,
## #   Ions.Score <int>, DeltaMZ <dbl>, isoInterOk <lgl>, noNAs <lgl>,
## #   onehit.protein <lgl>, shared.peptide <lgl>, `127C` <dbl>, `127N` <dbl>,
## #   `128C` <dbl>, `128N` <dbl>, `129C` <dbl>, `129N` <dbl>, `130C` <dbl>,
## #   `130N` <dbl>

2.2 CONSTANd

Now let’s apply CONSTANd to each Run separately, and then combine the results into a semi-wide dataframe again.

# dat.unit.l entries are in long format so all have same colnames and no channelNames
x.split <- split(dat.unit.w$CONSTANd, dat.unit.w$CONSTANd$Run)  # apply CONSTANd to each Run separately
x.split.norm  <- lapply(x.split, function(y) {
  y[,channelNames] <- CONSTANd(y[,channelNames])$normalized_data
  return(y)
})
dat.norm.w$CONSTANd <- bind_rows(x.split.norm)
dat.norm.w$CONSTANd
## # A tibble: 58,028 x 23
##    Mixture TechRepMixture Run   Protein Peptide    RT Charge PTM  
##    <fct>   <fct>          <fct> <fct>   <fct>   <dbl> <fct>  <fct>
##  1 Mixtur… 1              Mixt… Q86X55  [R].lL… 142.  2      N-Te…
##  2 Mixtur… 1              Mixt… P14868  [R].vT… 160.  3      N-Te…
##  3 Mixtur… 1              Mixt… P49736  [R].gL… 177.  3      N-Te…
##  4 Mixtur… 1              Mixt… P21333  [K].yG… 150.  2      N-Te…
##  5 Mixtur… 1              Mixt… O14745  [R].sV…  94.6 2      N-Te…
##  6 Mixtur… 1              Mixt… Q9H7Z7  [K].pN… 184.  3      N-Te…
##  7 Mixtur… 1              Mixt… Q99873  [K].wL… 190.  2      N-Te…
##  8 Mixtur… 1              Mixt… Q08379  [R].lE…  78.3 2      N-Te…
##  9 Mixtur… 1              Mixt… P62847  [K].tT… 195.  2      N-Te…
## 10 Mixtur… 1              Mixt… Q9UG63  [K].lV… 131.  2      N-Te…
## # … with 58,018 more rows, and 15 more variables: TotalIntensity <dbl>,
## #   Ions.Score <int>, DeltaMZ <dbl>, isoInterOk <lgl>, noNAs <lgl>,
## #   onehit.protein <lgl>, shared.peptide <lgl>, `127C` <dbl>, `127N` <dbl>,
## #   `128C` <dbl>, `128N` <dbl>, `129C` <dbl>, `129N` <dbl>, `130C` <dbl>,
## #   `130N` <dbl>

3 Summarization component: Median summarization

Summarize quantification values from PSM to peptide (first step) to protein (second step).

# normalized data
# group by (run,)protein,peptide then summarize twice (once on each level)
dat.norm.summ.w <- lapply(dat.norm.w, function(x) x %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% ungroup() )

Notice that the row sums are not equal to Ncols anymore, because the median summarization does not preserve them (but mean summarization does).

4 Normalization component: medianSweeping (2)

# medianSweeping: in each channel, subtract median computed across all proteins within the channel
# do the above separately for each MS run
x.split <- split(dat.norm.summ.w$medianSweeping, dat.norm.summ.w$medianSweeping$Run)
x.split.norm  <- lapply( x.split, function(y) {
  y[,channelNames] <- sweep(y[,channelNames], 2, apply(y[,channelNames], 2, median, na.rm=T) )
  return(y) } )
dat.norm.summ.w$medianSweeping <- bind_rows(x.split.norm)

5 QC plots

# make data completely wide (also across runs)

dat.norm.summ.w2 <- lapply( dat.norm.summ.w, function(x) x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}") )

5.1 Boxplots

# use (half-)wide format
par(mfrow=c(1,2))
for (i in seq_along(variant.names)) {
  boxplot_w(dat.norm.summ.w[[i]], sample.info, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1,1))

5.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

# different unit variants require different computation of fold changes and average abuandance: additive or multiplicative scale; see maplot_ils function
# use wide2 format
p <- emptyList(variant.names)
for (i in 1: n.comp.variants){
  p[[i]] <- maplot_ils(dat.norm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('Normalized', variant.names[i], sep='_'))
}
grid.arrange(p[[1]], p[[2]], ncol=2)

5.3 MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

# different unit variants require different computation of fold changes and average abuandance: additive or multiplicative scale; see maplot_ils function
channels.num <- sample.info %>% filter(Condition=='1') %>% select(Sample) %>% pull
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% select(Sample) %>% pull
p <- emptyList(variant.names)
for (i in 1: n.comp.variants){
  p[[i]] <- maplot_ils(dat.norm.summ.w2[[i]], channels.num, channels.denom, scale=scale.vec[i], paste('Normalized', variant.names[i], sep='_'))
}
grid.arrange(p[[1]], p[[2]], ncol=2)

5.4 CV (coefficient of variation) plots

dat.norm.summ.l <- lapply(dat.norm.summ.w, function(x){
  x$Mixture <- unlist(lapply(stri_split(x$Run,fixed='_'), function(y) y[1]))
  x <- to_long_format(x, sample.info)
})

par(mfrow=c(1, 2))
for (i in 1: n.comp.variants){
    cvplot_ils(dat=dat.norm.summ.l[[i]], feature.group='Protein', xaxis.group='Condition',
               title=paste('Normalized', variant.names[i], sep='_'), abs=F)
}

par(mfrow=c(1, 1))

5.5 PCA plots

5.5.1 Using all proteins

par(mfrow=c(1, 2))
for (i in seq_along(dat.norm.summ.w2)){
  pcaplot_ils(dat.norm.summ.w2[[i]] %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))

5.5.2 Using spiked proteins only

par(mfrow=c(1, 2))
for (i in seq_along(dat.norm.summ.w2)){
    pcaplot_ils(dat.norm.summ.w2[[i]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))

5.6 HC (hierarchical clustering) plots

Only use spiked proteins

par(mfrow=c(1, 2))
for (i in seq_along(dat.norm.summ.w2)){
  dendrogram_ils(dat.norm.summ.w2[[i]] %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))

6 DEA component: Moderated t-test

NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).

design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (i in seq_along(dat.norm.summ.w2)) {
  this_scale <- scale.vec[match(names(dat.dea)[i], variant.names)]
  d <- column_to_rownames(as.data.frame(dat.norm.summ.w2[[i]]), 'Protein')
  dat.dea[[i]] <- moderated_ttest(dat=d, design.matrix, scale=this_scale)
}

7 Results comparison

7.1 Confusion matrix

cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm)
0.125
medianSweeping
CONSTANd
background spiked background spiked
not_DE 4061 4 4062 5
DE 3 15 2 14
0.125
medianSweeping CONSTANd
Accuracy 0.998 0.998
Sensitivity 0.789 0.737
Specificity 0.999 1.000
PPV 0.833 0.875
NPV 0.999 0.999
0.667
medianSweeping
CONSTANd
background spiked background spiked
not_DE 4062 17 4063 15
DE 2 2 1 4
0.667
medianSweeping CONSTANd
Accuracy 0.995 0.996
Sensitivity 0.105 0.211
Specificity 1.000 1.000
PPV 0.500 0.800
NPV 0.996 0.996
1
medianSweeping
CONSTANd
background spiked background spiked
not_DE 4060 5 4055 3
DE 4 14 9 16
1
medianSweeping CONSTANd
Accuracy 0.998 0.997
Sensitivity 0.737 0.842
Specificity 0.999 0.998
PPV 0.778 0.640
NPV 0.999 0.999

7.2 Scatter plots

# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
q.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)

scatterplot_ils(dat.dea, q.cols, 'q-values')

scatterplot_ils(dat.dea, logFC.cols, 'log2FC')

7.3 Volcano plots

for (i in 1:n.contrasts){
  volcanoplot_ils(dat.dea, i, spiked.proteins)
}

7.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per condition
dat.spiked.logfc <- lapply(dat.dea, function(x) x[spiked.proteins,logFC.cols])
dat.spiked.logfc.l <- lapply(dat.spiked.logfc, function(x) {
  x %>% rename_with(function(y) sapply(y, function(z) strsplit(z, '_')[[1]][2])) %>% pivot_longer(cols = everything(), names_to = 'condition', values_to = 'logFC') %>% add_column(Protein=rep(rownames(x), each=length(colnames(x)))) })
violinplot_ils(lapply(dat.spiked.logfc.l, filter, condition != referenceCondition))

8 Conclusions

9 Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_BE.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_BE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_BE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] CONSTANd_0.99.4   forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2      
##  [5] purrr_0.3.4       readr_1.4.0       tidyr_1.1.2       tibble_3.0.4     
##  [9] tidyverse_1.3.0   psych_2.0.9       limma_3.45.19     kableExtra_1.3.1 
## [13] dendextend_1.14.0 gridExtra_2.3     stringi_1.5.3     ggplot2_3.3.2    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-150         fs_1.5.0             lubridate_1.7.9     
##  [4] webshot_0.5.2        httr_1.4.2           tools_4.0.3         
##  [7] backports_1.1.10     utf8_1.1.4           R6_2.4.1            
## [10] rpart_4.1-15         DBI_1.1.0            mgcv_1.8-33         
## [13] colorspace_1.4-1     nnet_7.3-14          withr_2.3.0         
## [16] tidyselect_1.1.0     mnormt_2.0.2         compiler_4.0.3      
## [19] cli_2.1.0            rvest_0.3.6          xml2_1.3.2          
## [22] labeling_0.4.2       scales_1.1.1         digest_0.6.27       
## [25] rmarkdown_2.5        pkgconfig_2.0.3      htmltools_0.5.0     
## [28] highr_0.8            dbplyr_1.4.4         rlang_0.4.8         
## [31] readxl_1.3.1         rstudioapi_0.11      farver_2.0.3        
## [34] generics_0.0.2       jsonlite_1.7.1       ModelMetrics_1.2.2.2
## [37] magrittr_1.5         Matrix_1.2-18        Rcpp_1.0.5          
## [40] munsell_0.5.0        fansi_0.4.1          viridis_0.5.1       
## [43] lifecycle_0.2.0      pROC_1.16.2          yaml_2.2.1          
## [46] MASS_7.3-53          plyr_1.8.6           recipes_0.1.14      
## [49] grid_4.0.3           blob_1.2.1           parallel_4.0.3      
## [52] crayon_1.3.4         lattice_0.20-41      haven_2.3.1         
## [55] splines_4.0.3        hms_0.5.3            tmvnsim_1.0-2       
## [58] knitr_1.30           pillar_1.4.6         stats4_4.0.3        
## [61] reshape2_1.4.4       codetools_0.2-16     reprex_0.3.0        
## [64] glue_1.4.2           evaluate_0.14        data.table_1.13.2   
## [67] modelr_0.1.8         vctrs_0.3.4          foreach_1.5.1       
## [70] cellranger_1.1.0     gtable_0.3.0         assertthat_0.2.1    
## [73] xfun_0.18            gower_0.2.2          prodlim_2019.11.13  
## [76] broom_0.7.2          e1071_1.7-4          class_7.3-17        
## [79] survival_3.2-7       viridisLite_0.3.0    timeDate_3043.102   
## [82] iterators_1.0.13     lava_1.6.8           ellipsis_0.3.1      
## [85] caret_6.0-86         ipred_0.9-9